This document describes how we map the checklist data to Darwin Core. The source file for this document can be found here.
Load libraries:
library(tidyverse) # Data manipulation
library(obisdi) # Tools for data ingestion for OBIS
library(here) # Get paths (important!)
library(arrow) # To deal with parquet files
library(worrms) # Taxonomy checking
This dataset was downloaded from the BioTIME¹ database and can be found at this link: https://biotime.st-andrews.ac.uk/selectStudy.php?study=428. Two files are available at this link: the raw data and the metadata.
# Resolve the data/raw folder from the project root and show what it contains
raw_path <- here("data", "raw")
list.files(path = raw_path)
## [1] "metadata_428.csv" "raw_data_428.csv"
We first read the raw data:
# Load the raw BioTIME records, then inspect the column names and types
dataset <- read.csv(file.path(raw_path, "raw_data_428.csv"))
str(dataset)
## 'data.frame': 101221 obs. of 10 variables:
## $ ABUNDANCE : int 1 1 1 1 1 1 2 2 3 3 ...
## $ BIOMAS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ GENUS_SPECIES: chr "Anguilla anguilla" "Zoarces viviparus" "Zoarces viviparus" "Zoarces viviparus" ...
## $ SAMPLE_DESC : chr "58.0640747805_7.76405684663_15_9_2009_1_1_4_2_3_1" "58.0640747805_7.76405684663_17_9_1988_1_1_4_9_9_1" "58.0640747805_7.76405684663_12_10_1990_1_1_4_2_2_1" "58.0640747805_7.76405684663_16_9_1993_1_1_5_2_4_1" ...
## $ PLOT : int 1 1 1 1 1 1 1 1 1 1 ...
## $ LATITUDE : num 58.1 58.1 58.1 58.1 58.1 ...
## $ LONGITUDE : num 7.76 7.76 7.76 7.76 7.76 ...
## $ DAY : int 15 17 12 16 14 16 15 15 18 17 ...
## $ MONTH : int 9 9 10 9 9 9 9 9 9 9 ...
## $ YEAR : int 2009 1988 1990 1993 1995 2015 1998 2001 1989 1988 ...
# Preview the first six records
head(dataset, n = 6L)
By reading the metadata we can see that there is no
biomass information, so we are interested in the abundance
information:
Samples are collected from the Skagerrak coast and each station (we have coordinates) is a beach seine haul (5-700m2) where all species are counted. Unit of abundance = IndCountInt, Unit of biomass = NA
We select only the relevant columns. We also add a unique ID to each occurrence so that any transformation of the data can be traced back.
# Drop the columns that are not mapped: the metadata reports no biomass unit,
# and SAMPLE_DESC is discarded here as well.
dataset <- select(dataset, -BIOMAS, -SAMPLE_DESC)
# Give every record an ID so later transformations can be traced back.
# The prefix carries the BioTIME study number (428); it previously read
# "study248", which did not match the study or the datasetID built from the
# metadata. NOTE(review): if IDs with the old prefix were already published,
# confirm before re-issuing them.
# seq_len() is used instead of 1:nrow() so that an empty table yields no IDs
# instead of the two-element sequence c(1, 0).
dataset$uniqueID <- paste0("biotime_study428_", seq_len(nrow(dataset)))
We will need to generate two files for the submission: (1) event – describes the sampling events; (2) occurrence – the occurrences of species in each event.
event table
We start by getting each unique event. Both the occurrences and the
events table need to have the same eventID and locationID, so we still
work with the dataset object.
# Sort the records (descending day, then month, then year) and derive the IDs
# shared by the event and occurrence tables:
#   - one eventID per coordinate / plot / date combination
#   - one locationID per coordinate / plot combination
# The result stays grouped by LATITUDE, LONGITUDE and PLOT.
dataset <- dataset %>%
  arrange(desc(DAY), desc(MONTH), desc(YEAR)) %>%
  group_by(LATITUDE, LONGITUDE, PLOT, DAY, MONTH, YEAR) %>%
  mutate(dwc_eventID = paste0("skag_ev_", cur_group_id())) %>%
  group_by(LATITUDE, LONGITUDE, PLOT) %>%
  mutate(dwc_locationID = paste0("skag_lo_", cur_group_id()))
Correct the name of Longitude/Latitude fields, and date.
# Copy coordinates and date parts into Darwin Core named columns.
# eventDate is built zero-padded as YYYY-MM-DD: Darwin Core recommends
# ISO 8601 dates, and pasting the raw integers produced values such as
# "2009-9-15". DAY, MONTH and YEAR are integer columns in the raw file, so
# the %d formats apply directly.
dataset <- dataset %>%
  mutate(dwc_decimalLatitude = LATITUDE,
         dwc_decimalLongitude = LONGITUDE) %>%
  mutate(dwc_eventDate = sprintf("%04d-%02d-%02d", YEAR, MONTH, DAY)) %>%
  mutate(dwc_day = DAY, dwc_year = YEAR, dwc_month = MONTH)
Now we create a new object called events that will be
our events table. We add other columns that are needed:
# dwc_datasetID below is built from the study ID stored in the metadata file.
# Nothing earlier in this document creates `metadata`, so load it here unless
# it already exists (assumes metadata_428.csv has a STUDY_ID column, as the
# expression below requires).
if (!exists("metadata", inherits = FALSE)) {
  metadata <- read.csv(here("data", "raw", "metadata_428.csv"))
}
# Add the event-level Darwin Core terms; every record gets the same constant
# values, and the table is reduced to one row per event in a later step.
events <- dataset %>%
  mutate(dwc_type = "Event",
         dwc_ownerInstitutionCode = "HAVFORSKNINGSINSTITUTTET",
         dwc_samplingProtocol = "Beach seine haul (40m long, mesh size 1.5cm), nearshore (<15m depth) - all species counted.",
         dwc_sampleSizeUnit = "square metre",
         dwc_sampleSizeValue = "5-700",
         dwc_samplingEffort = "transect",
         dwc_country = "Norway",
         dwc_countryCode = "NO",
         dwc_locality = "Skagerrak",
         dwc_datasetID = paste0("biotime_study", metadata$STUDY_ID[1]),
         dwc_datasetName = "Long term monitoring of fish abundances from coastal Skagerrak.")
Get only the unique events:
# Keep one row per event (the first one encountered) with all of its columns.
# Grouping is dropped first so the result is a plain, ungrouped table.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
events <- events %>%
  ungroup() %>%
  distinct(dwc_eventID, .keep_all = TRUE)
occurrence table
We first create a new object that will have our occurrences:
# Start the occurrence table from the working dataset, which already carries
# the dwc_eventID / dwc_locationID columns shared with the events table
occurrences <- dataset
Then we add the needed columns:
# Add the occurrence-level Darwin Core terms in two steps: record-level
# constants first, then the quantities taken from ABUNDANCE.
occurrences <- occurrences %>%
  mutate(
    dwc_type = "Event",
    dwc_ownerInstitutionCode = "HAVFORSKNINGSINSTITUTTET",
    dwc_occurrenceID = uniqueID,
    dwc_basisOfRecord = "HumanObservation"
  ) %>%
  # dwc_scientificName is deliberately not copied from GENUS_SPECIES here;
  # it is added later, once the names have been corrected
  mutate(
    dwc_individualCount = ABUNDANCE,
    dwc_organismQuantity = ABUNDANCE,
    dwc_organismQuantityType = "individuals"
  )
We have no information on whether this dataset was checked against WoRMS for taxonomic consistency. Thus, we do a check here:
# Look into WoRMS for each name
# Query WoRMS once for every distinct name present in the raw data
name_checking <- occurrences$GENUS_SPECIES %>%
  unique() %>%
  wm_records_names()
# Tabulate the status reported across all returned records
table(unlist(lapply(name_checking, function(rec) rec$status)))
##
## accepted unaccepted
## 52 3
# Three of the returned records are unaccepted; for those we take the valid
# names further below.
# Here we list the names for which WoRMS returned no record at all
# (a result with zero rows):
no_match <- unique(occurrences$GENUS_SPECIES)[
  vapply(name_checking, nrow, integer(1)) == 0
]
no_match
## [1] "Manetyngel unknown" "Polioptila caerulea" "Sygnathus typhle"
## [4] "Labrus bimaculatus"
Some records need updated names, and there are also 4 names for which there was no match. One of them is an unknown taxon and one is a non-marine species², both of which we will exclude, while for the other 2 we manually correct the information:
# Replace every empty WoRMS result with a single all-NA row shaped like the
# first result, so that each queried name still contributes one row when the
# results are bound together.
name_checking <- local({
  unmatched <- vapply(name_checking, nrow, integer(1)) == 0
  if (any(unmatched)) {
    placeholder <- name_checking[[1]][1, ]
    placeholder[1, ] <- NA
    name_checking[unmatched] <- list(placeholder)
  }
  name_checking
})
# Stack the WoRMS results and keep the classification plus the valid name,
# renamed to Darwin Core terms. The resulting column order is: kingdom,
# phylum, class, order, family, genus, scientificName,
# scientificNameAuthorship, scientificNameID, taxonRank.
name_info <- name_checking %>%
  bind_rows() %>%
  select(kingdom, phylum, class, order, family, genus,
         scientificName = valid_name,
         scientificNameAuthorship = valid_authority,
         scientificNameID = valid_AphiaID,
         taxonRank = rank) %>%
  # Turn the AphiaID into a WoRMS LSID
  mutate(scientificNameID = paste0("urn:lsid:marinespecies.org:taxname:",
                                   scientificNameID)) %>%
  # Attach the queried names by position (one row per queried name)
  mutate(GENUS_SPECIES = unique(occurrences$GENUS_SPECIES))
# Drop the unidentified taxon and the non-marine name from both the
# occurrences and the name lookup table
occurrences <- filter(
  occurrences,
  !(GENUS_SPECIES %in% c("Manetyngel unknown", "Polioptila caerulea"))
)
name_info <- filter(
  name_info,
  !(GENUS_SPECIES %in% c("Manetyngel unknown", "Polioptila caerulea"))
)
# Manually fill in the two names that WoRMS could not match.
# The assignment is positional: columns 1:10 of name_info are, in order,
# kingdom, phylum, class, order, family, genus, scientificName,
# scientificNameAuthorship, scientificNameID and taxonRank, so each list
# below must follow exactly that order.
# "Sygnathus typhle" (as spelled in the raw data) is mapped to
# Syngnathus typhle.
name_info[name_info$GENUS_SPECIES == "Sygnathus typhle",1:10] <-
list("Animalia", "Chordata", "Teleostei", "Syngnathiformes", "Syngnathidae", "Syngnathus", "Syngnathus typhle", "(Linnaeus, 1758)",
"urn:lsid:marinespecies.org:taxname:127393", "Species")
# "Labrus bimaculatus" is mapped to Labrus mixtus; the order is left as NA.
name_info[name_info$GENUS_SPECIES == "Labrus bimaculatus",1:10] <-
list("Animalia", "Chordata", "Teleostei", NA, "Labridae", "Labrus", "Labrus mixtus", "(Linnaeus, 1758)",
"urn:lsid:marinespecies.org:taxname:151501", "Species") # According to FishBase
# Prefix the ten taxonomy columns with "dwc_" and record the original name
# for the one taxon that was looked up on FishBase (NA for all others)
name_info <- name_info %>%
  rename_with(~ paste0("dwc_", .x), .cols = 1:10) %>%
  mutate(dwc_originalNameUsage = ifelse(GENUS_SPECIES == "Labrus bimaculatus",
                                        "Labrus bimaculatus", NA))
# Attach the corrected names to each occurrence and lower-case the rank
occurrences <- occurrences %>%
  left_join(name_info, by = "GENUS_SPECIES") %>%
  mutate(dwc_taxonRank = tolower(dwc_taxonRank))
Now we check if both tables have the same number of unique
locationID and eventID:
# Compare the eventIDs of the two tables with each other. The previous call
# compared occurrences with itself, which is TRUE by construction.
# setequal() ignores order and duplicates; FALSE means an eventID appears in
# only one of the two tables (for example an event whose records were all
# removed with the excluded taxa).
setequal(events$dwc_eventID, occurrences$dwc_eventID)
## [1] TRUE
# Compare the locationIDs of the two tables with each other. The previous
# call compared occurrences with itself, which is TRUE by construction.
# setequal() ignores order and duplicates; FALSE means a locationID appears
# in only one of the two tables.
setequal(events$dwc_locationID, occurrences$dwc_locationID)
## [1] TRUE
Remove unused columns and change names of those with dwc_:
# Events: note the plot in eventRemarks, keep only the Darwin Core columns
# and strip their "dwc_" prefix
events <- events %>%
  mutate(dwc_eventRemarks = paste("Plot", PLOT)) %>%
  select(starts_with("dwc_")) %>%
  rename_with(~ str_remove(.x, "dwc_"))
# Occurrences: same treatment, then drop the columns that are kept only in
# the events table
occurrences <- occurrences %>%
  ungroup() %>%
  select(starts_with("dwc_")) %>%
  rename_with(~ str_remove(.x, "dwc_")) %>%
  select(-c(day, month, year, decimalLatitude, decimalLongitude, eventDate,
            locationID))
We now have our final files:
# Print both final tables for a visual check before exporting
occurrences
events
We export in csv format:
# Write both tables to data/processed as CSV files
proc_path <- here("data", "processed")
write_csv(events, file.path(proc_path, "events.csv"))
write_csv(occurrences, file.path(proc_path, "occurrences.csv"))
The name Polioptila caerulea returns a bird in all our searches. It is possible that the name was entered in the table by mistake.↩︎
MPA Europe project has been approved under HORIZON-CL6-2021-BIODIV-01-12 — Improved science based maritime spatial planning and identification of marine protected areas.